# Load the raw job-postings dataset; empty strings are treated as missing.
data <- read.csv(
  "C:/Users/kriti/Downloads/fake_job_postings.csv",
  header = TRUE,
  na.strings = c("", "NA")
)
head(data)
# Class distribution of the target (0 = legitimate, 1 = fraudulent).
table(data$fraudulent)
0 1
17014 866
# Downsample the majority class: keep a random 2000 non-fraudulent rows and
# every fraudulent row (866) to reduce the class imbalance.
set.seed(123)
non_fraud <- data[data$fraudulent == 0, ]
non_fraud <- non_fraud[sample(nrow(non_fraud), 2000), ]
fraud <- data[data$fraudulent == 1, ]
data <- rbind(non_fraud, fraud)
table(data$fraudulent)
0 1
2000 866
# createDataPartition() comes from caret; in the original script the package
# was not loaded until much later, so this line would fail on a fresh session.
library(caret)

# Shuffle the rows, then take a stratified 90/10 train/test split on the
# target so both splits keep a similar fraud rate.
data <- data[sample(nrow(data)), ]
index <- createDataPartition(y = data$fraudulent, p = 0.9, list = FALSE)
train_data <- data[index, ]
test_data <- data[-index, ]
#train_data
# Visualize the class balance of the training split.
count <- table(train_data$fraudulent)
barplot(
  count,
  main = "Count of Fraudulent vs. Non-Fraudulent (Train Data)",
  xlab = "Fraudulent",
  ylab = "Count",
  col = c("lightblue", "pink"),
  legend = c("Not Fraud", "Fraud")
)
#test_data
# Visualize the class balance of the test split, then summarize the
# training columns.
count <- table(test_data$fraudulent)
barplot(
  count,
  main = "Count of Fraudulent vs. Non-Fraudulent (Test Data)",
  xlab = "Fraudulent",
  ylab = "Count",
  col = c("lightblue", "pink"),
  legend = c("Not Fraud", "Fraud")
)
summary(train_data)
job_id title location department salary_range
Min. : 1 Length:2580 Length:2580 Length:2580 Length:2580
1st Qu.: 4812 Class :character Class :character Class :character Class :character
Median : 9108 Mode :character Mode :character Mode :character Mode :character
Mean : 9502
3rd Qu.:14596
Max. :17876
company_profile description requirements benefits telecommuting
Length:2580 Length:2580 Length:2580 Length:2580 Min. :0.00000
Class :character Class :character Class :character Class :character 1st Qu.:0.00000
Mode :character Mode :character Mode :character Mode :character Median :0.00000
Mean :0.05194
3rd Qu.:0.00000
Max. :1.00000
has_company_logo has_questions employment_type required_experience required_education
Min. :0.0000 Min. :0.0000 Length:2580 Length:2580 Length:2580
1st Qu.:0.0000 1st Qu.:0.0000 Class :character Class :character Class :character
Median :1.0000 Median :0.0000 Mode :character Mode :character Mode :character
Mean :0.6709 Mean :0.4391
3rd Qu.:1.0000 3rd Qu.:1.0000
Max. :1.0000 Max. :1.0000
industry function. fraudulent
Length:2580 Length:2580 Min. :0.0000
Class :character Class :character 1st Qu.:0.0000
Mode :character Mode :character Median :0.0000
Mean :0.2996
3rd Qu.:1.0000
Max. :1.0000
# Column-wise summary of the held-out test split.
summary(test_data)
job_id title location department salary_range
Min. : 27 Length:286 Length:286 Length:286 Length:286
1st Qu.: 4994 Class :character Class :character Class :character Class :character
Median : 8951 Mode :character Mode :character Mode :character Mode :character
Mean : 9348
3rd Qu.:14298
Max. :17823
company_profile description requirements benefits telecommuting
Length:286 Length:286 Length:286 Length:286 Min. :0.00000
Class :character Class :character Class :character Class :character 1st Qu.:0.00000
Mode :character Mode :character Mode :character Mode :character Median :0.00000
Mean :0.02797
3rd Qu.:0.00000
Max. :1.00000
has_company_logo has_questions employment_type required_experience required_education
Min. :0.0000 Min. :0.000 Length:286 Length:286 Length:286
1st Qu.:0.0000 1st Qu.:0.000 Class :character Class :character Class :character
Median :1.0000 Median :0.000 Mode :character Mode :character Mode :character
Mean :0.6434 Mean :0.472
3rd Qu.:1.0000 3rd Qu.:1.000
Max. :1.0000 Max. :1.000
industry function. fraudulent
Length:286 Length:286 Min. :0.0000
Class :character Class :character 1st Qu.:0.0000
Mode :character Mode :character Median :0.0000
Mean :0.3252
3rd Qu.:1.0000
Max. :1.0000
# Let's check for null values
# NOTE(review): this counts NAs on the combined `data`, not per split —
# confirm that is intended rather than checking train_data/test_data.
colSums(is.na(data))
job_id title location department salary_range
0 0 54 1804 2337
company_profile description requirements benefits telecommuting
907 0 457 1186 0
has_company_logo has_questions employment_type required_experience required_education
0 0 619 1237 1377
industry function. fraudulent
824 1081 0
Here, we can see that many columns have null values; let’s handle them:
# Calculating the correlation for: department, salary_range,function.
# NOTE(review): as.numeric(as.factor(x)) assigns arbitrary integer codes to
# nominal labels, and the train and test columns are encoded independently,
# so these Pearson correlations are not meaningful measures of association
# between category and target — confirm this screening approach is intended.
##department
#Train Data
train_data$department = as.numeric(as.factor(train_data$department))
# Missing departments are mapped to code 0.
train_data$department = ifelse(is.na(train_data$department), 0, train_data$department)
correlation_matrix = cor(train_data$department, train_data$fraudulent)
correlation_matrix # -0.07768365
[1] -0.07354623
#Test_data
test_data$department = as.numeric(as.factor(test_data$department))
test_data$department = ifelse(is.na(test_data$department), 0, test_data$department)
correlation_matrix = cor(test_data$department, test_data$fraudulent)
correlation_matrix #-0.02963282
[1] -0.0786168
##salary Range
#Train Data
train_data$salary_range = as.numeric(as.factor(train_data$salary_range))
train_data$salary_range = ifelse(is.na(train_data$salary_range), 0, train_data$salary_range)
correlation_matrix = cor(train_data$salary_range, train_data$fraudulent)
correlation_matrix #0.1017228
[1] 0.101208
#Test Data
test_data$salary_range = as.numeric(as.factor(test_data$salary_range))
test_data$salary_range = ifelse(is.na(test_data$salary_range), 0, test_data$salary_range)
correlation_matrix = cor(test_data$salary_range, test_data$fraudulent)
correlation_matrix #0.124986
[1] 0.1258764
##function.
#Train_data
train_data$function. = as.numeric(as.factor(train_data$function.))
train_data$function. = ifelse(is.na(train_data$function.), 0, train_data$function.)
correlation_matrix = cor(train_data$function., train_data$fraudulent)
correlation_matrix #-0.1476297
[1] -0.1522213
#Test Data
test_data$function. = as.numeric(as.factor(test_data$function.))
test_data$function. = ifelse(is.na(test_data$function.), 0, test_data$function.)
correlation_matrix = cor(test_data$function., test_data$fraudulent)
correlation_matrix #-0.180516
[1] -0.1438862
# The correlation values are all close to 0 (correlation is always within
# [-1, 1]; what matters is distance from 0), indicating only a weak linear
# relation with the target, so we remove these variables from both splits.
train_data = subset(train_data, select = -c(salary_range,department, function.))
test_data = subset(test_data, select = -c(salary_range,department, function.))
Let’s impute the missing values:
No numerical variables have null values.
# Impute missing values: replace NA with the empty string in every column.
# BUG FIX: the original assigned the imputed frame to `data`, so
# train_data/test_data still contained NAs (as the colSums output below
# showed). Impute both working splits instead.
# (Safe here because no numeric column has NAs, so ifelse does not coerce.)
train_data = data.frame(
  lapply(train_data, function(x) ifelse(is.na(x), '', x))
)
test_data = data.frame(
  lapply(test_data, function(x) ifelse(is.na(x), '', x))
)
colSums(is.na(train_data))
job_id title location company_profile description
0 0 43 818 0
requirements benefits telecommuting has_company_logo has_questions
409 1066 0 0 0
employment_type required_experience required_education industry fraudulent
570 1115 1249 747 0
# Distribution plots for the three binary indicator columns in the
# training split. Titles match the originals exactly.
plot_titles <- c(
  telecommuting = "Count Plot for telecommuting",
  has_company_logo = "Count Plot for company_logo",
  has_questions = "Count Plot for questions"
)
for (v in names(plot_titles)) {
  count <- table(train_data[[v]])
  barplot(count,
          main = plot_titles[[v]],
          xlab = "id",
          ylab = "Count",
          col = "skyblue",
          border = "black"
  )
}
There is a high class imbalance in the telecommuting variable.
# For job_id
# NOTE(review): job_id is an arbitrary row identifier; any correlation with
# the target is an artifact of how the sample was assembled — it should not
# be interpreted as signal.
correlation <- cor(train_data$job_id, train_data$fraudulent)
correlation #0.1557648
[1] 0.1482325
# For telecommuting
correlation <- cor(train_data$telecommuting, train_data$fraudulent)
correlation #0.06998303
[1] 0.07569604
# For has_company_logo
correlation <- cor(train_data$has_company_logo, train_data$fraudulent)
correlation #-0.4789252
[1] -0.4819276
# For has_questions
correlation <- cor(train_data$has_questions, train_data$fraudulent)
correlation #-0.2056319
[1] -0.205375
# Removing these three variables from the data set.
# NOTE(review): has_company_logo has by far the strongest correlation
# (about -0.48) yet is kept here and dropped later without explanation,
# while has_questions (-0.21) is removed — confirm this selection.
train_data = subset(train_data, select = -c(job_id,telecommuting,has_questions))
test_data = subset(test_data, select = -c(job_id,telecommuting,has_questions))
# Helper: Pearson's chi-squared test of independence between a categorical
# predictor and the target; returns the full htest object.
chi_test = function(var, target) {
  chisq.test(var, target)
}
# Categorical columns to screen against the target with chi-squared tests.
cat_var = c(
  "title", "location", "company_profile", "description", "requirements",
  "benefits", "employment_type", "required_experience",
  "required_education", "industry"
)
# Perform chi-square tests for each variable and print the results.
# BUG FIX: a bare `var` inside a for loop is NOT auto-printed in R, so the
# original output never showed which variable each test belonged to.
for (var in cat_var) {
  cat("Variable:", var, "\n")
  print(chi_test(train_data[[var]], train_data$fraudulent))
}
Warning: Chi-squared approximation may be incorrect
Pearson's Chi-squared test
data: var and target
X-squared = 2466.1, df = 1960, p-value = 3.524e-14
Warning: Chi-squared approximation may be incorrect
Pearson's Chi-squared test
data: var and target
X-squared = 1741.2, df = 975, p-value < 2.2e-16
Warning: Chi-squared approximation may be incorrect
Pearson's Chi-squared test
data: var and target
X-squared = 1762, df = 660, p-value < 2.2e-16
Warning: Chi-squared approximation may be incorrect
Pearson's Chi-squared test
data: var and target
X-squared = 2580, df = 2277, p-value = 7.988e-06
Warning: Chi-squared approximation may be incorrect
Pearson's Chi-squared test
data: var and target
X-squared = 2160.3, df = 1812, p-value = 2.343e-08
Warning: Chi-squared approximation may be incorrect
Pearson's Chi-squared test
data: var and target
X-squared = 1506, df = 1141, p-value = 1.643e-12
Pearson's Chi-squared test
data: var and target
X-squared = 37.836, df = 4, p-value = 1.211e-07
Pearson's Chi-squared test
data: var and target
X-squared = 59.935, df = 6, p-value = 4.639e-11
Warning: Chi-squared approximation may be incorrect
Pearson's Chi-squared test
data: var and target
X-squared = 158.28, df = 11, p-value < 2.2e-16
Warning: Chi-squared approximation may be incorrect
Pearson's Chi-squared test
data: var and target
X-squared = 660.72, df = 106, p-value < 2.2e-16
We can see that the p-values are < 0.05, so we reject the null hypothesis of independence and treat these variables as significantly associated with the target.
# Identify character columns (the factor conversion below is kept disabled,
# as in the original).
char_vars = sapply(train_data, is.character)
# Convert those variables to factor
#data[char_vars] = lapply(data[char_vars], as.factor)
#train_data
# Plot the ten most frequent required_experience levels in the training data.
experience_counts = table(train_data$required_experience)
top_experience_index = head(order(experience_counts, decreasing = TRUE), 10)
top_counts = experience_counts[top_experience_index]
barplot(
  top_counts,
  main = "Count Plot for Experience",
  xlab = "Experience",
  ylab = "Count",
  col = "skyblue",
  border = "black",
  las = 2
)
We can deduce that most values are missing, and that hiring happens mostly at the senior and entry levels compared with other experience requirements.
# Extract the country (the text before the first comma) from location.
train_data$country <- sub(",.*", "", train_data$location)
test_data$country <- sub(",.*", "", test_data$location)
# Plot the ten countries with the most postings in the training data.
country_counts = table(train_data$country)
top_countries_index = head(order(country_counts, decreasing = TRUE), 10)
top_counts = country_counts[top_countries_index]
barplot(
  top_counts,
  main = "Count Plot for Countries",
  xlab = "Countries",
  ylab = "Count",
  col = "skyblue",
  border = "black",
  las = 2
)
#train_data$country = as.factor(train_data$country)
US has the most openings.
# Plot the five most common required_education levels in the training data.
ed_counts = table(train_data$required_education)
ed_index = head(order(ed_counts, decreasing = TRUE), 5)
top_counts = ed_counts[ed_index]
barplot(
  top_counts,
  main = "Count Plot for Education Level",
  xlab = "Education",
  ylab = "Count",
  col = "skyblue",
  border = "black",
  las = 2
)
Again, a Bachelor’s degree is the most common requirement.
# Redefine the helper to return only the chi-squared statistic, used below
# to rank variables by strength of association with the target.
chi_test <- function(var, target) {
  chisq.test(var, target)$statistic
}
# Variables to rank by chi-squared statistic (location replaced by the
# derived country column; has_company_logo added).
cat_var <- c(
  "title", "country", "company_profile", "description", "requirements",
  "benefits", "employment_type", "required_experience",
  "required_education", "industry", "has_company_logo"
)
# Initializing an empty vector to store chi-square statistics for each variable
# (preallocated to the right length, which avoids growing a vector in the loop).
chi_sq_stats <- numeric(length(cat_var))
# Performing chi-square tests for each variable and storing the results
for (i in seq_along(cat_var)) {
  chi_sq_stats[i] <- chi_test(train_data[[cat_var[i]]], train_data$fraudulent)
}
Warning: Chi-squared approximation may be incorrectWarning: Chi-squared approximation may be incorrectWarning: Chi-squared approximation may be incorrectWarning: Chi-squared approximation may be incorrectWarning: Chi-squared approximation may be incorrectWarning: Chi-squared approximation may be incorrectWarning: Chi-squared approximation may be incorrectWarning: Chi-squared approximation may be incorrect
# New data frame with variable names and their corresponding chi-square statistics created
chi_sq_df <- data.frame(variable = cat_var, chi_sq_statistic = chi_sq_stats)
# Ordering based on chi-square statistics in descending order, so the most
# strongly associated variables appear first.
chi_sq_df <- chi_sq_df[order(chi_sq_df$chi_sq_statistic, decreasing = TRUE), ]
chi_sq_df
Based on the lowest chi-squared statistics, we can discard country, location, required_education, required_experience, employment_type, and has_company_logo from the dataset.
#train_data
# Drop the weakly-associated variables from the training split.
# FIX: the original listed `location` twice inside -c(); once is enough.
train_data = subset(
  train_data,
  select = -c(country, location, required_education, required_experience,
              employment_type, has_company_logo)
)
train_data
#test_data
# Drop the same variables from the test split.
# FIX: the original listed `location` twice inside -c(); once is enough.
test_data = subset(
  test_data,
  select = -c(country, location, required_education, required_experience,
              employment_type, has_company_logo)
)
test_data
# Concatenate all free-text fields into a single `text` column per split,
# then drop the now-redundant source columns.
text_cols <- c("title", "company_profile", "description", "requirements",
               "benefits", "industry")
train_data$text <- do.call(paste, c(train_data[text_cols], sep = " "))
test_data$text <- do.call(paste, c(test_data[text_cols], sep = " "))
# Removing these variables:
train_data[text_cols] <- NULL
test_data[text_cols] <- NULL
# Reproducibly shuffle the row order within each split.
set.seed(123)
shuffle_rows <- function(df) df[sample(nrow(df)), ]
train_data = shuffle_rows(train_data)
test_data = shuffle_rows(test_data)
# Text-mining toolkit: tm for corpora and document-term matrices,
# wordcloud for visualization, SnowballC for stemming.
library(tm)
library(wordcloud)
library(SnowballC)
# One corpus document per posting's concatenated text.
text_corpus_train = VCorpus(VectorSource(train_data$text))
text_corpus_test = VCorpus(VectorSource(test_data$text))
# Standard text-normalization pipeline, applied identically to both corpora:
# lowercase -> strip digits -> drop stop words -> strip punctuation ->
# stem -> collapse whitespace.
clean_corpus <- function(corpus) {
  corpus <- tm_map(corpus, content_transformer(tolower))
  corpus <- tm_map(corpus, removeNumbers)
  corpus <- tm_map(corpus, removeWords, stopwords())
  corpus <- tm_map(corpus, removePunctuation)
  corpus <- tm_map(corpus, stemDocument)
  tm_map(corpus, stripWhitespace)
}
text_corpus_clean_train <- clean_corpus(text_corpus_train)
text_corpus_clean_test <- clean_corpus(text_corpus_test)
#train_data
# Overall cloud for the training corpus, then separate clouds for the
# fraudulent and non-fraudulent subsets (rows align because the corpus was
# built from train_data$text in order).
wordcloud(text_corpus_clean_train,min.freq = 50, random.order = FALSE)
fraudulent = subset(text_corpus_clean_train, train_data$fraudulent == 1)
non_fraudulent = subset(text_corpus_clean_train, train_data$fraudulent == 0)
wordcloud(fraudulent, max.words = 20, scale = c(3, 0.5))
wordcloud(non_fraudulent, max.words = 20, scale = c(3, 0.5))
#test_data
# Word clouds for the test corpus, split by class.
wordcloud(text_corpus_clean_test,min.freq = 50, random.order = FALSE)
fraudulent = subset(text_corpus_clean_test, test_data$fraudulent == 1)
# BUG FIX: the original subset the *train* corpus with the *test* mask here,
# which mixes the splits and misaligns the logical index with the corpus rows.
non_fraudulent = subset(text_corpus_clean_test, test_data$fraudulent == 0)
wordcloud(fraudulent, max.words = 20, scale = c(3, 0.5))
wordcloud(non_fraudulent, max.words = 20, scale = c(3, 0.5))
#train_data
# Document-term matrix of raw term frequencies for the training corpus.
train_data_dtm = DocumentTermMatrix(text_corpus_clean_train)
train_data_dtm
<<DocumentTermMatrix (documents: 2580, terms: 42872)>>
Non-/sparse entries: 389772/110219988
Sparsity : 100%
Maximal term length: 517
Weighting : term frequency (tf)
#test_data
# Document-term matrix for the test corpus. NOTE(review): built from the
# test corpus's own vocabulary, so its columns differ from the training DTM.
test_data_dtm = DocumentTermMatrix(text_corpus_clean_test)
test_data_dtm
<<DocumentTermMatrix (documents: 286, terms: 8819)>>
Non-/sparse entries: 41206/2481028
Sparsity : 98%
Maximal term length: 630
Weighting : term frequency (tf)
# Secondary 70/30 split of the training DTM for model development; rows
# were already shuffled above, so a positional split is unbiased.
index = round(nrow(train_data_dtm) * 0.70)
train_dtm = train_data_dtm[seq_len(index), ]
test_dtm = train_data_dtm[(index + 1):nrow(train_data_dtm), ]
# Matching label vectors, taken from the same row ranges.
train_label = train_data$fraudulent[seq_len(index)]
test_label = train_data$fraudulent[(index + 1):nrow(train_data)]
#checking the length.
# Sanity check: DTM row counts and label vector lengths must match for
# both development splits before modelling.
nrow(train_dtm)
[1] 1806
length(train_label)
[1] 1806
nrow(test_dtm)
[1] 774
length(test_label)
[1] 774
# Simple benchmark: predict the majority class (0, non-fraudulent) for
# every test row.
# BUG FIX: the original predicted all 1s yet labelled the 193 non-fraud
# cases as "TN" with TP = 0 — those labels describe an all-0 predictor, and
# the reported accuracy (193/286 = 0.675) is the all-0 majority-class
# baseline. The benchmark is made explicit and counts are no longer
# hard-coded, so it stays correct if the split changes.
n_test <- length(test_data$fraudulent)
vector <- rep(0, n_test)
pred_table <- table(vector, test_data$fraudulent)
pred_table
# For an all-0 predictor: TN = actual 0s, FN = actual 1s, TP = FP = 0.
TN <- sum(test_data$fraudulent == 0)
FN <- sum(test_data$fraudulent == 1)
TP <- 0
FP <- 0
total <- TP + TN + FP + FN
accuracy <- (TP + TN) / total
accuracy # majority-class baseline accuracy (~0.675 for this split)
#train_data
# Keep only terms that occur at least 10 times in the training DTM; this
# list defines the model vocabulary.
textFreqWords_train = findFreqTerms(train_dtm,10)
train_dtm_freq = train_dtm[,textFreqWords_train]
train_dtm_freq
<<DocumentTermMatrix (documents: 1806, terms: 3217)>>
Non-/sparse entries: 224755/5585147
Sparsity : 96%
Maximal term length: 252
Weighting : term frequency (tf)
#Test_data
# BUG FIX: the evaluation DTM must use the SAME vocabulary as the training
# DTM — the original selected terms frequent *in the test split*, giving the
# two feature matrices different, misaligned columns that cannot feed one
# classifier. Reuse the training frequent-term list instead.
test_dtm_freq = test_dtm[, textFreqWords_train]
test_dtm_freq
<<DocumentTermMatrix (documents: 774, terms: 2006)>>
Non-/sparse entries: 92067/1460577
Sparsity : 94%
Maximal term length: 65
Weighting : term frequency (tf)
# Binarize term counts: any positive count becomes "Yes", zero stays "No"
# (the representation expected by bag-of-words classifiers like naive Bayes).
convertCounts = function(x) {
  ifelse(x > 0, "Yes", "No")
}
# Apply the binarization column-by-column; result is a character matrix
# with one column per vocabulary term.
train_text = apply(train_dtm_freq, MARGIN = 2, convertCounts)
test_text = apply(test_dtm_freq, MARGIN = 2, convertCounts)
#calculations for inverse of confusion matrix to find out the fraud cases.
# NOTE(review): `confusion_matrix` (a CrossTable-style object, per the $t /
# $prop.row output below) is printed here but never created in this visible
# chunk — confirm the call that builds it (e.g. gmodels::CrossTable on the
# classifier's predictions vs test_label) exists upstream.
confusion_matrix
$t
y
x 0 1
0 462 27
1 75 210
$prop.row
y
x 0 1
0 0.94478528 0.05521472
1 0.26315789 0.73684211
$prop.col
y
x 0 1
0 0.8603352 0.1139241
1 0.1396648 0.8860759
$prop.tbl
y
x 0 1
0 0.59689922 0.03488372
1 0.09689922 0.27131783
# Evaluation metrics taken from the confusion matrix above
# (positive class = fraudulent, i.e. 1).
TP <- 210  # predicted 1, actually 1
TN <- 462  # predicted 0, actually 0
FP <- 75   # predicted 1, actually 0
FN <- 27   # predicted 0, actually 1
total <- TP + TN + FP + FN
accuracy <- (TP + TN) / total
accuracy
recall <- TP / (TP + FN)
recall
precision <- TP / (TP + FP)
precision
# Harmonic mean of precision and recall.
F1_score <- 2 * precision * recall / (precision + recall)
F1_score
str(train_label)
int [1:1806] 0 0 0 0 1 0 0 0 0 1 ...
set.seed(1)
library(caret)
library(gbm)
# 5-fold cross-validation for hyperparameter tuning.
train_control = trainControl(method = "cv", number = 5)
# BUG FIX: `as.formula(textTrainLabels ~ .,)` had a stray trailing comma
# (a syntax error) and referenced a name that is not a column of the data;
# textTrain_df/textTest_df were also never built. Construct modelling
# frames from the binarized term matrices, with the label as a factor so
# caret fits a classifier.
textTrain_df = data.frame(train_text, check.names = TRUE)
textTrain_df$label = factor(train_label, levels = c(0, 1))
textTest_df = data.frame(test_text, check.names = TRUE)
gbm_model <- train(label ~ .,
                   data = textTrain_df,
                   method = "gbm",
                   trControl = train_control,
                   tuneLength = 5) # Number of tuning candidates per parameter
# Print the trained model
print(gbm_model)
# predicting our model
gbm_prediction = predict(gbm_model, textTest_df)
# BUG FIX: compare predictions to the true labels, not to the whole
# feature data frame.
table(gbm_prediction, test_label)
set.seed(1)
library(caret)
library(gbm)
# 3-fold CV (random forests are slower to tune than gbm).
train_control = trainControl(method = "cv", number = 3)
# BUG FIX: same formula syntax error and undefined data frames as the gbm
# block. Build the modelling frames here if they don't already exist so
# this block also runs on its own.
if (!exists("textTrain_df")) {
  textTrain_df = data.frame(train_text, check.names = TRUE)
  textTrain_df$label = factor(train_label, levels = c(0, 1))
}
if (!exists("textTest_df")) {
  textTest_df = data.frame(test_text, check.names = TRUE)
}
rf_model <- train(label ~ .,
                  data = textTrain_df,
                  method = "rf",
                  trControl = train_control,
                  tuneLength = 3,
                  importance = TRUE)
print(rf_model)
# predicting our model
random_forest_prediction = predict(rf_model, textTest_df)
# BUG FIX: confusion table against the true labels, not the feature frame.
table(random_forest_prediction, test_label)
# handle imbalance dataset
# Add benchmark model